library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.0.4 ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(here)
here() starts at C:/Users/Lenovo/Documents/RStudioProjects/UFO-Sightings---R-Project
library(withr)
ufo_sightings <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/ufo_sightings.csv')
Rows: 96429 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): city, state, country_code, shape, reported_duration, summary, day_part
dbl (1): duration_seconds
lgl (1): has_images
dttm (2): reported_date_time, reported_date_time_utc
date (1): posted_date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
places <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/places.csv')
Rows: 14417 Columns: 10── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): city, alternate_city_names, state, country, country_code, timezone
dbl (4): latitude, longitude, population, elevation_m
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
day_parts_map <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-06-20/day_parts_map.csv')
Rows: 26409 Columns: 12── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
dbl (2): rounded_lat, rounded_long
date (1): rounded_date
time (9): astronomical_twilight_begin, nautical_twilight_begin, civil_twilight_begin, sunrise, solar_noon, sunset, civil_twilight_end, nauti...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Checking if files have been loaded correctly
head(ufo_sightings)
head(places)
head(day_parts_map)
Saving data in respective files
dir.create(here("data", "2023", "2023-06-20"), recursive = TRUE, showWarnings = FALSE)
write_csv(ufo_sightings, here("data", "2023", "2023-06-20", "ufo_sightings.csv"))
write_csv(places, here("data", "2023", "2023-06-20", "places.csv"))
write_csv(day_parts_map, here("data", "2023", "2023-06-20", "day_parts_map.csv"))
Checking missing data
glimpse(ufo_sightings)
Rows: 96,429
Columns: 12
$ reported_date_time <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ reported_date_time_utc <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-…
$ posted_date <date> 2022-09-09, 2022-10-08, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 20…
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US…
$ shape <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ reported_duration <chr> "15 mins\u0085", "1 minute", "2 hours", "30 seconds", "3 minutes", "10 minutes", "20 seconds", "5 minutes", "90…
$ duration_seconds <dbl> 900, 60, 172800, 30, 180, 600, 20, 300, 120, 1800, 10, 3, 45, 60, 240, 32, 300, 600, 180, 1200, 45, 300, 180, 1…
$ summary <chr> "Saw multi color object above horizon.", "An object in the shape of a straight line about an inch from our view…
$ has_images <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,…
$ day_part <chr> "night", "nautical dusk", "night", "afternoon", "night", "morning", "morning", "afternoon", NA, "astronomical d…
glimpse(places)
Rows: 14,417
Columns: 10
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Gr…
$ alternate_city_names <chr> "Pajnkherst,bynhwrst,pynhwrst karwlynay shmaly,Пајнхерст,بينهورست,پینهورست، کارولینای شمالی", NA, "CLE,Cleavelan…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM",…
$ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Australia", "USA", "USA", "USA", "USA", "USA", "India", …
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US",…
$ latitude <dbl> 35.19543, 44.83445, 41.49950, 39.16533, 33.66946, 35.33951, 37.65042, 41.11760, -33.53233, 44.48912, 44.05368, 33…
$ longitude <dbl> -79.46948, -85.28256, -81.69541, -86.52639, -117.82311, -97.48670, -77.61249, -73.40790, 149.25367, -108.05621, -…
$ timezone <chr> "America/New_York", "America/Detroit", "America/New_York", "America/Indiana/Indianapolis", "America/Los_Angeles",…
$ population <dbl> 15752, 1352, 388072, 84067, 256927, 60451, 24729, 88485, 3355, 1879, 2349, 1608139, 71579, 7912, 197340, 559121, …
$ elevation_m <dbl> 160, 192, 199, 235, 17, 382, 89, 11, NA, 1156, 160, 331, 1, 157, NA, 1511, 414, 2, 393, 89, 1373, 264, 1, 89, 220…
glimpse(day_parts_map)
Rows: 26,409
Columns: 12
$ rounded_lat <dbl> 40, 40, 40, 40, 30, 40, 40, 40, -30, 40, 40, 30, 30, 40, 30, 40, 30, 30, 40, 30, 40, 30, 40, 30, 50, 40, 2…
$ rounded_long <dbl> -80, -90, -80, -90, -120, -100, -80, -70, 150, -110, -70, -110, -80, -120, 80, -110, -120, -80, -100, -120…
$ rounded_date <date> 2022-08-28, 2022-08-21, 2022-08-14, 2022-08-07, 2022-08-07, 2022-07-24, 2022-07-17, 2022-07-17, 2022-07-1…
$ astronomical_twilight_begin <time> 09:07:43, 09:38:33, 08:48:54, 09:19:00, 11:55:02, 09:39:05, 08:09:39, 07:29:37, 19:30:59, 10:01:25, 07:21…
$ nautical_twilight_begin <time> 09:42:49, 10:14:53, 09:26:40, 09:58:23, 12:26:50, 10:22:03, 08:54:27, 08:14:25, 20:00:01, 10:47:51, 08:07…
$ civil_twilight_begin <time> 10:16:18, 10:49:12, 10:01:56, 10:34:41, 12:57:22, 11:00:33, 09:34:01, 08:54:00, 20:29:37, 11:28:21, 08:48…
$ sunrise <time> 10:42:53, 11:16:15, 10:29:32, 11:02:53, 13:21:39, 11:30:01, 10:04:06, 09:24:04, 20:54:22, 11:58:55, 09:18…
$ solar_noon <time> 17:21:10, 18:03:05, 17:24:39, 18:05:45, 20:05:45, 18:46:32, 17:26:12, 16:46:12, 02:05:20, 19:25:26, 16:45…
$ sunset <time> 23:59:27, 00:49:55, 00:19:45, 01:08:37, 02:49:51, 02:03:03, 00:48:18, 00:08:20, 07:16:18, 02:51:58, 00:12…
$ civil_twilight_end <time> 00:26:02, 01:16:58, 00:47:21, 01:36:50, 03:14:07, 02:32:31, 01:18:23, 00:38:24, 07:41:03, 03:22:32, 00:42…
$ nautical_twilight_end <time> 00:59:31, 01:51:17, 01:22:37, 02:13:08, 03:44:39, 03:11:02, 01:57:57, 01:17:59, 08:10:40, 04:03:02, 01:23…
$ astronomical_twilight_end <time> 01:34:37, 02:27:37, 02:00:24, 02:52:31, 04:16:27, 03:54:00, 02:42:45, 02:02:47, 08:39:42, 04:49:28, 02:09…
colSums(is.na(ufo_sightings))
reported_date_time reported_date_time_utc posted_date city state country_code
0 0 0 0 85 0
shape reported_duration duration_seconds summary has_images day_part
2039 0 0 31 0 2563
colSums(is.na(places))
city alternate_city_names state country country_code latitude
0 2953 32 0 0 0
longitude timezone population elevation_m
0 0 0 2285
colSums(is.na(day_parts_map))
rounded_lat rounded_long rounded_date astronomical_twilight_begin nautical_twilight_begin
0 0 0 951 122
civil_twilight_begin sunrise solar_noon sunset civil_twilight_end
2 2 0 2 2
nautical_twilight_end astronomical_twilight_end
122 951
Summary of the data
dim(ufo_sightings)
[1] 96429 12
summary(ufo_sightings)
reported_date_time reported_date_time_utc posted_date city state
Min. :1925-12-29 00:00:00.00 Min. :1925-12-29 00:00:00.00 Min. :1998-03-07 Length:96429 Length:96429
1st Qu.:2004-10-01 05:10:00.00 1st Qu.:2004-10-01 05:10:00.00 1st Qu.:2006-10-30 Class :character Class :character
Median :2012-02-05 03:00:00.00 Median :2012-02-05 03:00:00.00 Median :2012-08-19 Mode :character Mode :character
Mean :2009-04-30 02:41:30.98 Mean :2009-04-30 02:41:30.98 Mean :2011-09-26
3rd Qu.:2016-01-25 03:30:00.00 3rd Qu.:2016-01-25 03:30:00.00 3rd Qu.:2016-07-15
Max. :2023-05-18 19:27:00.00 Max. :2023-05-18 19:27:00.00 Max. :2023-05-19
country_code shape reported_duration duration_seconds summary has_images day_part
Length:96429 Length:96429 Length:96429 Min. :0.000e+00 Length:96429 Mode :logical Length:96429
Class :character Class :character Class :character 1st Qu.:3.000e+01 Class :character FALSE:96429 Class :character
Mode :character Mode :character Mode :character Median :1.800e+02 Mode :character Mode :character
Mean :3.161e+04
3rd Qu.:6.000e+02
Max. :1.987e+09
dim(places)
[1] 14417 10
summary(places)
city alternate_city_names state country country_code latitude longitude
Length:14417 Length:14417 Length:14417 Length:14417 Length:14417 Min. :-53.15 Min. :-170.48
Class :character Class :character Class :character Class :character Class :character 1st Qu.: 34.99 1st Qu.: -95.46
Mode :character Mode :character Mode :character Mode :character Mode :character Median : 40.09 Median : -84.21
Mean : 37.76 Mean : -75.36
3rd Qu.: 42.96 3rd Qu.: -74.82
Max. : 70.64 Max. : 179.19
timezone population elevation_m
Length:14417 Min. : 0 Min. : -57.0
Class :character 1st Qu.: 1926 1st Qu.: 65.0
Mode :character Median : 6085 Median : 194.0
Mean : 86375 Mean : 288.2
3rd Qu.: 21993 3rd Qu.: 304.0
Max. :22315474 Max. :3097.0
NA's :2285
dim(day_parts_map)
[1] 26409 12
summary(day_parts_map)
rounded_lat rounded_long rounded_date astronomical_twilight_begin nautical_twilight_begin civil_twilight_begin
Min. :-50.00 Min. :-170.00 Min. :1925-12-27 Length:26409 Length:26409 Length:26409
1st Qu.: 30.00 1st Qu.:-110.00 1st Qu.:1999-01-17 Class1:hms Class1:hms Class1:hms
Median : 40.00 Median : -90.00 Median :2007-03-25 Class2:difftime Class2:difftime Class2:difftime
Mean : 36.23 Mean : -80.01 Mean :2004-03-18 Mode :numeric Mode :numeric Mode :numeric
3rd Qu.: 40.00 3rd Qu.: -80.00 3rd Qu.:2014-10-12
Max. : 70.00 Max. : 180.00 Max. :2023-05-21
sunrise solar_noon sunset civil_twilight_end nautical_twilight_end astronomical_twilight_end
Length:26409 Length:26409 Length:26409 Length:26409 Length:26409 Length:26409
Class1:hms Class1:hms Class1:hms Class1:hms Class1:hms Class1:hms
Class2:difftime Class2:difftime Class2:difftime Class2:difftime Class2:difftime Class2:difftime
Mode :numeric Mode :numeric Mode :numeric Mode :numeric Mode :numeric Mode :numeric
sum(duplicated(ufo_sightings))
[1] 3
Due to the large size of the file on which the analysis is performed, only some of the data was used for visual representation.
library(tidyverse)
library(here)
library(withr)
library(naniar)
ufo_sightings %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
Replacing missing data with most frequent entry and saving cleaned data
most_common_day_part <- ufo_sightings %>%
count(day_part) %>%
arrange(desc(n)) %>%
slice(1) %>%
pull(day_part)
most_common_shape <- ufo_sightings %>%
count(shape) %>%
arrange(desc(n)) %>%
slice(1) %>%
pull(shape)
# najczestrza wartosc
ufo_clean <- ufo_sightings %>%
mutate(
day_part = ifelse(is.na(day_part), most_common_day_part, day_part),
shape = ifelse(is.na(shape), most_common_shape, shape)
)
write_csv(ufo_clean, here("data", "2023", "2023-06-20", "ufo_clean.csv"))
ufo_clean %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
places %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
Replacing missing city name with empty string, elevation with mean value and saving cleaned data
#nie mam lepszego pomysłu na uzuepłenienie niż " "
places_clean <- places %>%
mutate(
alternate_city_names = ifelse(is.na(alternate_city_names), " ", alternate_city_names)
)
median_elevation <- median(places$elevation_m, na.rm = TRUE)
places_clean <- places_clean %>%
mutate(
elevation_m = ifelse(is.na(elevation_m), median_elevation, elevation_m)
)
write_csv(places_clean, here("data", "2023", "2023-06-20", "places_clean.csv"))
places_clean %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
day_parts_map %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
Replacing missing data with mean value and saving cleaned data
day_parts_clean <- day_parts_map %>%
mutate(
astronomical_twilight_begin = ifelse(
is.na(astronomical_twilight_begin),
median(astronomical_twilight_begin, na.rm = TRUE),
astronomical_twilight_begin
),
astronomical_twilight_end = ifelse(
is.na(astronomical_twilight_end),
median(astronomical_twilight_end, na.rm = TRUE),
astronomical_twilight_end
)
)
write_csv(day_parts_clean, here("data", "2023", "2023-06-20", "day_parts_clean.csv"))
day_parts_clean %>%
dplyr::slice_sample(n = 1000) %>%
vis_miss(cluster = TRUE, sort_miss = TRUE)
ufo_model_data <- ufo_clean %>%
filter(!is.na(shape), !is.na(reported_duration), !is.na(summary))
places_model_data <- places_clean %>%
filter(!is.na(alternate_city_names))
glimpse(ufo_model_data)
Rows: 96,398
Columns: 12
$ reported_date_time <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-07-22 16:00:00, 2022-07-19 16…
$ reported_date_time_utc <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-07-22 16:00:00, 2022-07-19 16…
$ posted_date <date> 2022-09-09, 2022-10-08, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09…
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Greybull", "North Conway", "P…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM", "CA", "NC", "OK", "CA", "N…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US", "US", "US", "US", "US", "U…
$ shape <chr> "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "ligh…
$ reported_duration <chr> "15 mins\u0085", "1 minute", "2 hours", "30 seconds", "3 minutes", "10 minutes", "20 seconds", "5 minutes", "90 - 120 seconds", "20-30 minut…
$ duration_seconds <dbl> 900, 60, 172800, 30, 180, 600, 20, 300, 120, 1800, 10, 3, 45, 60, 240, 32, 300, 600, 180, 1200, 45, 300, 180, 1200, 300, 7200, 47, 1800, 180…
$ summary <chr> "Saw multi color object above horizon.", "An object in the shape of a straight line about an inch from our viewing area moving slowly across…
$ has_images <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ day_part <chr> "night", "nautical dusk", "night", "afternoon", "night", "morning", "morning", "afternoon", "night", "astronomical dusk", "afternoon", "morn…
glimpse(places_model_data)
Rows: 14,417
Columns: 10
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Greybull", "North Conway", "Pho…
$ alternate_city_names <chr> "Pajnkherst,bynhwrst,pynhwrst karwlynay shmaly,Пајнхерст,بينهورست,پینهورست، کارولینای شمالی", " ", "CLE,Cleaveland,Cleveland,Forest City,Klev…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM", "CA", "NC", "OK", "CA", "NV"…
$ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Australia", "USA", "USA", "USA", "USA", "USA", "India", "USA", "USA", "USA", "USA", "…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US", "US", "US", "US", "US", "US"…
$ latitude <dbl> 35.19543, 44.83445, 41.49950, 39.16533, 33.66946, 35.33951, 37.65042, 41.11760, -33.53233, 44.48912, 44.05368, 33.44838, 26.14036, 43.39012, 2…
$ longitude <dbl> -79.46948, -85.28256, -81.69541, -86.52639, -117.82311, -97.48670, -77.61249, -73.40790, 149.25367, -108.05621, -71.12840, -112.07404, -80.213…
$ timezone <chr> "America/New_York", "America/Detroit", "America/New_York", "America/Indiana/Indianapolis", "America/Los_Angeles", "America/Chicago", "America/…
$ population <dbl> 15752, 1352, 388072, 84067, 256927, 60451, 24729, 88485, 3355, 1879, 2349, 1608139, 71579, 7912, 197340, 559121, 71035, 6137, 25892, 3898747, …
$ elevation_m <dbl> 160, 192, 199, 235, 17, 382, 89, 11, 194, 1156, 160, 331, 1, 157, 194, 1511, 414, 2, 393, 89, 1373, 264, 1, 89, 220, 287, 798, 219, 8, 1820, 5…
glimpse(day_parts_clean)
Rows: 26,409
Columns: 12
$ rounded_lat <dbl> 40, 40, 40, 40, 30, 40, 40, 40, -30, 40, 40, 30, 30, 40, 30, 40, 30, 30, 40, 30, 40, 30, 40, 30, 50, 40, 20, 30, 30, 30, 30, 30, 30, 30…
$ rounded_long <dbl> -80, -90, -80, -90, -120, -100, -80, -70, 150, -110, -70, -110, -80, -120, 80, -110, -120, -80, -100, -120, -120, -120, -80, -80, -120,…
$ rounded_date <date> 2022-08-28, 2022-08-21, 2022-08-14, 2022-08-07, 2022-08-07, 2022-07-24, 2022-07-17, 2022-07-17, 2022-07-10, 2022-07-10, 2022-07-10, 20…
$ astronomical_twilight_begin <dbl> 32863, 34713, 31734, 33540, 42902, 34745, 29379, 26977, 70259, 36085, 26478, 39124, 31660, 38079, 79348, 35471, 40996, 31561, 33859, 41…
$ nautical_twilight_begin <time> 09:42:49, 10:14:53, 09:26:40, 09:58:23, 12:26:50, 10:22:03, 08:54:27, 08:14:25, 20:00:01, 10:47:51, 08:07:45, 11:26:57, 09:23:03, 11:2…
$ civil_twilight_begin <time> 10:16:18, 10:49:12, 10:01:56, 10:34:41, 12:57:22, 11:00:33, 09:34:01, 08:54:00, 20:29:37, 11:28:21, 08:48:15, 11:59:43, 09:56:11, 12:0…
$ sunrise <time> 10:42:53, 11:16:15, 10:29:32, 11:02:53, 13:21:39, 11:30:01, 10:04:06, 09:24:04, 20:54:22, 11:58:55, 09:18:51, 12:25:25, 10:22:07, 12:3…
$ solar_noon <time> 17:21:10, 18:03:05, 17:24:39, 18:05:45, 20:05:45, 18:46:32, 17:26:12, 16:46:12, 02:05:20, 19:25:26, 16:45:26, 19:25:26, 17:24:18, 20:0…
$ sunset <time> 23:59:27, 00:49:55, 00:19:45, 01:08:37, 02:49:51, 02:03:03, 00:48:18, 00:08:20, 07:16:18, 02:51:58, 00:12:00, 02:25:28, 00:26:29, 03:3…
$ civil_twilight_end <time> 00:26:02, 01:16:58, 00:47:21, 01:36:50, 03:14:07, 02:32:31, 01:18:23, 00:38:24, 07:41:03, 03:22:32, 00:42:36, 02:51:10, 00:52:25, 04:0…
$ nautical_twilight_end <time> 00:59:31, 01:51:17, 01:22:37, 02:13:08, 03:44:39, 03:11:02, 01:57:57, 01:17:59, 08:10:40, 04:03:02, 01:23:06, 03:23:56, 01:25:33, 04:4…
$ astronomical_twilight_end <dbl> 5677, 8857, 7224, 10351, 15387, 14040, 9765, 7367, 31182, 17368, 7773, 14329, 7256, 20039, 55211, 17157, 16433, 6532, 13744, 15427, 179…
write_csv(ufo_model_data, here("data", "2023", "2023-06-20", "ufo_model_data.csv"))
write_csv(places_model_data, here("data", "2023", "2023-06-20", "places_model_data.csv"))
library(lubridate)
library(dplyr)
library(hms)
Dołączanie pakietu: ‘hms’
Następujący obiekt został zakryty z ‘package:lubridate’:
hms
Adding new columns to “sightings” dataframe
ufo_model_data_mutated <- ufo_model_data %>%
mutate(
year = year(reported_date_time),
month = month(reported_date_time),
weekday = wday(reported_date_time, label = TRUE, abbr = FALSE, locale = "C"),
is_weekend = weekday %in% c("Sat", "Sun"),
country_upper = toupper(country_code),
report_hour = hour(reported_date_time),
city_state = paste(city, state, sep = ", "),
report_delay_days = as.numeric(difftime(posted_date, as.Date(reported_date_time), units = "days"))
)
ufo_model_data_mutated
year: Extracts the year from the
reported_date_time.month: Extracts the month (1–12) from the report
time-stamp.weekday: Returns the weekday name from the date.is_weekend: Logical column: TRUE if the
day is Saturday or Sunday, FALSE otherwise.country_upper: Converts the country_code
to uppercase.report_hour: Extracts the hour (0–23) from the report
time-stamp.city_state: Concatenates city and
state into a single string.report_delay_days: Calculates the delay in days between
when the event was reported and when it was posted.Adding new columns to “places” dataframe
places_model_data_mutated <- places_model_data %>%
mutate(
city_state = paste(city, state, sep = ", "),
is_us = country_code == "US",
population_log = log1p(population),
hemisphere = ifelse(latitude >= 0, "Northern", "Southern"),
is_coastal = abs(longitude) < 80 | abs(longitude) > 120,
pop_category = case_when(
population < 10000 ~ "small",
population < 100000 ~ "medium",
TRUE ~ "large"
),
elevation_category = case_when(
is.na(elevation_m) ~ "unknown",
elevation_m < 100 ~ "low",
elevation_m < 500 ~ "medium",
TRUE ~ "high"
),
name_length = nchar(city),
timezone_area = sapply(strsplit(timezone, "/"), `[`, 2)
)
places_model_data_mutated
city_state: Combines city and
state into a single string.is_us: Logical value: TRUE if the location
is in the United States else FALSE.population_log: Log-transformed population.hemisphere: "Northern" if latitude is ≥ 0,
"Southern" otherwise.is_coastal: Logical: TRUE if longitude is
outside the range [80, 120] in absolute value — a rough coastal
proxy.pop_category: Categorizes places based on population:
"small", "medium", or
"large".elevation_category: Classifies elevation:
"low" (<100 m), "medium" (<500 m),
"high" (≥500 m), or "unknown" if NA.name_length: The number of characters in the city
name.timezone_area: Extracts the second part of the timezone
string.Adding new columns to “day parts” dataframe
day_parts_model_mutated <- day_parts_clean %>%
mutate(
daylight_duration = as.numeric(sunset - sunrise, units = "secs"),
is_northern_hemisphere = rounded_lat >= 0,
sunrise_hour = hour(sunrise),
sunset_hour = hour(sunset),
is_day_short = daylight_duration < 36000, # mniej niż 10h
twilight_duration = as.numeric(astronomical_twilight_end - astronomical_twilight_begin, units = "secs"),
is_long_twilight = twilight_duration > 5400, # 1.5h
sunrise_minutes = hour(sunrise) * 60 + minute(sunrise),
solar_noon_minutes = hour(solar_noon) * 60 + minute(solar_noon),
sunset_minutes = hour(sunset) * 60 + minute(sunset)
)
day_parts_model_mutated
daylight_duration: The length of the day in seconds —
difference between sunset and sunrise.is_northern_hemisphere: Logical: TRUE if
the location is in the Northern Hemisphere.sunrise_hour: The hour (0–23) when the sun rises.sunset_hour: The hour (0–23) when the sun sets.is_day_short: Logical: TRUE if the day is
shorter than 10 hourstwilight_duration: Duration of astronomical twilight in
seconds — time between astronomical_twilight_begin and
end.is_long_twilight: Logical: TRUE if
twilight duration is longer than 1.5 hourssunrise_minutes: Sunrise time in total minutes from
midnight.solar_noon_minutes: Solar noon time in minutes from
midnight.sunset_minutes: Sunset time in minutes from
midnight.glimpse(ufo_model_data_mutated)
Rows: 96,398
Columns: 20
$ reported_date_time <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-07-22 16:00:00, 2022-07-19 16…
$ reported_date_time_utc <dttm> 2022-08-29 06:03:00, 2022-08-20 01:51:00, 2022-08-13 05:30:00, 2022-08-06 21:00:00, 2022-08-04 07:40:00, 2022-07-22 16:00:00, 2022-07-19 16…
$ posted_date <date> 2022-09-09, 2022-10-08, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09-09, 2022-09…
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Greybull", "North Conway", "P…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM", "CA", "NC", "OK", "CA", "N…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US", "US", "US", "US", "US", "U…
$ shape <chr> "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "light", "ligh…
$ reported_duration <chr> "15 mins\u0085", "1 minute", "2 hours", "30 seconds", "3 minutes", "10 minutes", "20 seconds", "5 minutes", "90 - 120 seconds", "20-30 minut…
$ duration_seconds <dbl> 900, 60, 172800, 30, 180, 600, 20, 300, 120, 1800, 10, 3, 45, 60, 240, 32, 300, 600, 180, 1200, 45, 300, 180, 1200, 300, 7200, 47, 1800, 180…
$ summary <chr> "Saw multi color object above horizon.", "An object in the shape of a straight line about an inch from our viewing area moving slowly across…
$ has_images <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ day_part <chr> "night", "nautical dusk", "night", "afternoon", "night", "morning", "morning", "afternoon", "night", "astronomical dusk", "afternoon", "morn…
$ year <dbl> 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 2022, 20…
$ month <dbl> 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 12, 12, 11, 11…
$ weekday <ord> Monday, Saturday, Saturday, Saturday, Thursday, Friday, Tuesday, Thursday, Wednesday, Wednesday, Friday, Thursday, Tuesday, Saturday, Friday…
$ is_weekend <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ country_upper <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US", "US", "US", "US", "US", "U…
$ report_hour <int> 6, 1, 5, 21, 7, 16, 16, 18, 19, 4, 0, 16, 2, 16, 17, 2, 22, 20, 2, 23, 6, 9, 1, 2, 12, 1, 6, 2, 2, 7, 20, 22, 1, 2, 23, 5, 1, 0, 0, 4, 0, 4,…
$ city_state <chr> "Pinehurst, NC", "Rapid City, MI", "Cleveland, OH", "Bloomington, IN", "Irvine, CA", "Moore, OK", "Short Pump, VA", "Norwalk, CT", "Blayney,…
$ report_delay_days <dbl> 11, 49, 27, 34, 36, 49, 52, 57, 58, 58, 63, 64, 66, 69, 5, 15, 17, 23, 32, 35, 38, 30, 0, 67, 7, 13, 22, 24, 24, 29, 31, 181, 45, 46, 91, 8,…
glimpse(places_model_data_mutated)
Rows: 14,417
Columns: 19
$ city <chr> "Pinehurst", "Rapid City", "Cleveland", "Bloomington", "Irvine", "Moore", "Short Pump", "Norwalk", "Blayney", "Greybull", "North Conway", "Pho…
$ alternate_city_names <chr> "Pajnkherst,bynhwrst,pynhwrst karwlynay shmaly,Пајнхерст,بينهورست,پینهورست، کارولینای شمالی", " ", "CLE,Cleaveland,Cleveland,Forest City,Klev…
$ state <chr> "NC", "MI", "OH", "IN", "CA", "OK", "VA", "CT", "New South Wales", "WY", "NH", "AZ", "FL", "OR", "Haryana", "NM", "CA", "NC", "OK", "CA", "NV"…
$ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "USA", "Australia", "USA", "USA", "USA", "USA", "USA", "India", "USA", "USA", "USA", "USA", "…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US", "US", "AU", "US", "US", "US", "US", "US", "IN", "US", "US", "US", "US", "US", "US", "US", "US", "US"…
$ latitude <dbl> 35.19543, 44.83445, 41.49950, 39.16533, 33.66946, 35.33951, 37.65042, 41.11760, -33.53233, 44.48912, 44.05368, 33.44838, 26.14036, 43.39012, 2…
$ longitude <dbl> -79.46948, -85.28256, -81.69541, -86.52639, -117.82311, -97.48670, -77.61249, -73.40790, 149.25367, -108.05621, -71.12840, -112.07404, -80.213…
$ timezone <chr> "America/New_York", "America/Detroit", "America/New_York", "America/Indiana/Indianapolis", "America/Los_Angeles", "America/Chicago", "America/…
$ population <dbl> 15752, 1352, 388072, 84067, 256927, 60451, 24729, 88485, 3355, 1879, 2349, 1608139, 71579, 7912, 197340, 559121, 71035, 6137, 25892, 3898747, …
$ elevation_m <dbl> 160, 192, 199, 235, 17, 382, 89, 11, 194, 1156, 160, 331, 1, 157, 194, 1511, 414, 2, 393, 89, 1373, 264, 1, 89, 220, 287, 798, 219, 8, 1820, 5…
$ city_state <chr> "Pinehurst, NC", "Rapid City, MI", "Cleveland, OH", "Bloomington, IN", "Irvine, CA", "Moore, OK", "Short Pump, VA", "Norwalk, CT", "Blayney, N…
$ is_us <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TR…
$ population_log <dbl> 9.664786, 7.210080, 12.868949, 11.339381, 12.456551, 11.009605, 10.115772, 11.390600, 8.118505, 7.539027, 7.762171, 14.290589, 11.178571, 8.97…
$ hemisphere <chr> "Northern", "Northern", "Northern", "Northern", "Northern", "Northern", "Northern", "Northern", "Southern", "Northern", "Northern", "Northern"…
$ is_coastal <lgl> TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FAL…
$ pop_category <chr> "medium", "small", "large", "medium", "large", "medium", "medium", "medium", "small", "small", "small", "large", "medium", "small", "large", "…
$ elevation_category <chr> "medium", "medium", "medium", "medium", "low", "medium", "low", "low", "medium", "high", "medium", "medium", "low", "medium", "medium", "high"…
$ name_length <int> 9, 10, 9, 11, 6, 5, 10, 7, 7, 8, 12, 7, 10, 9, 7, 11, 8, 14, 5, 11, 4, 5, 14, 13, 5, 7, 10, 7, 7, 7, 12, 5, 10, 9, 6, 10, 12, 6, 7, 7, 12, 9, …
$ timezone_area <chr> "New_York", "Detroit", "New_York", "Indiana", "Los_Angeles", "Chicago", "New_York", "New_York", "Sydney", "Denver", "New_York", "Phoenix", "Ne…
glimpse(day_parts_clean)
Rows: 26,409
Columns: 12
$ rounded_lat <dbl> 40, 40, 40, 40, 30, 40, 40, 40, -30, 40, 40, 30, 30, 40, 30, 40, 30, 30, 40, 30, 40, 30, 40, 30, 50, 40, 20, 30, 30, 30, 30, 30, 30, 30…
$ rounded_long <dbl> -80, -90, -80, -90, -120, -100, -80, -70, 150, -110, -70, -110, -80, -120, 80, -110, -120, -80, -100, -120, -120, -120, -80, -80, -120,…
$ rounded_date <date> 2022-08-28, 2022-08-21, 2022-08-14, 2022-08-07, 2022-08-07, 2022-07-24, 2022-07-17, 2022-07-17, 2022-07-10, 2022-07-10, 2022-07-10, 20…
$ astronomical_twilight_begin <dbl> 32863, 34713, 31734, 33540, 42902, 34745, 29379, 26977, 70259, 36085, 26478, 39124, 31660, 38079, 79348, 35471, 40996, 31561, 33859, 41…
$ nautical_twilight_begin <time> 09:42:49, 10:14:53, 09:26:40, 09:58:23, 12:26:50, 10:22:03, 08:54:27, 08:14:25, 20:00:01, 10:47:51, 08:07:45, 11:26:57, 09:23:03, 11:2…
$ civil_twilight_begin <time> 10:16:18, 10:49:12, 10:01:56, 10:34:41, 12:57:22, 11:00:33, 09:34:01, 08:54:00, 20:29:37, 11:28:21, 08:48:15, 11:59:43, 09:56:11, 12:0…
$ sunrise <time> 10:42:53, 11:16:15, 10:29:32, 11:02:53, 13:21:39, 11:30:01, 10:04:06, 09:24:04, 20:54:22, 11:58:55, 09:18:51, 12:25:25, 10:22:07, 12:3…
$ solar_noon <time> 17:21:10, 18:03:05, 17:24:39, 18:05:45, 20:05:45, 18:46:32, 17:26:12, 16:46:12, 02:05:20, 19:25:26, 16:45:26, 19:25:26, 17:24:18, 20:0…
$ sunset <time> 23:59:27, 00:49:55, 00:19:45, 01:08:37, 02:49:51, 02:03:03, 00:48:18, 00:08:20, 07:16:18, 02:51:58, 00:12:00, 02:25:28, 00:26:29, 03:3…
$ civil_twilight_end <time> 00:26:02, 01:16:58, 00:47:21, 01:36:50, 03:14:07, 02:32:31, 01:18:23, 00:38:24, 07:41:03, 03:22:32, 00:42:36, 02:51:10, 00:52:25, 04:0…
$ nautical_twilight_end <time> 00:59:31, 01:51:17, 01:22:37, 02:13:08, 03:44:39, 03:11:02, 01:57:57, 01:17:59, 08:10:40, 04:03:02, 01:23:06, 03:23:56, 01:25:33, 04:4…
$ astronomical_twilight_end <dbl> 5677, 8857, 7224, 10351, 15387, 14040, 9765, 7367, 31182, 17368, 7773, 14329, 7256, 20039, 55211, 17157, 16433, 6532, 13744, 15427, 179…
library(ggplot2)
library(dplyr)
library(sf)
Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(rnaturalearthdata)
Dołączanie pakietu: ‘rnaturalearthdata’
Następujący obiekt został zakryty z ‘package:rnaturalearth’:
countries110
Number of sightings per day
ufo_model_data_mutated %>%
count(date = as.Date(reported_date_time)) %>%
ggplot(aes(x = date, y = n)) +
geom_line(color = "steelblue") +
labs(title = "Number of sightings per day", x = "Date", y = "Number of sightings")
Number of sightings depending on the day of the week
ufo_model_data_mutated %>%
count(weekday) %>%
ggplot(aes(x = weekday, y = n)) +
geom_col(fill = "orange") +
labs(title = "Sightings depending on the day of the week", x = "Day of the week", y = "Number of sightings")
Hourly distribution of sightings
ufo_model_data_mutated %>%
mutate(hour = hour(reported_date_time)) %>%
count(hour) %>%
ggplot(aes(x = hour, y = n)) +
geom_col(fill = "purple") +
labs(title = "Hourly distribution of sightings", x = "Hour of the day", y = "Number of sightings")
Heatmap: day of the week vs hour of the day
ufo_model_data_mutated %>%
mutate(
hour = hour(reported_date_time),
weekday = fct_relevel(weekday, c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
) %>%
count(weekday, hour) %>%
ggplot(aes(x = hour, y = weekday, fill = n)) +
geom_tile(color = "white") +
scale_fill_viridis_c() +
labs(title = "Heatmap: day of the week vs hour of the day", x = "Hour of the day", y = "Day of the week", fill = "Number of sightings")
Ostrzeżenie: There was 1 warning in `mutate()`.
ℹ In argument: `weekday = fct_relevel(...)`.
Caused by warning:
! 7 unknown levels in `f`: Mon, Tue, Wed, Thu, Fri, Sat, and Sun
ufo_model_data_mutated %>%
mutate(image_status = ifelse(has_images, "Has an image", "Has no image")) %>%
count(image_status) %>%
ggplot(aes(x = "", y = n, fill = image_status)) +
geom_col(width = 1) +
coord_polar(theta = "y") +
labs(title = "Sightins with images vs no image", fill = "Image existence") +
theme_void() +
scale_fill_manual(values = c("Has an image" = "#66BB6A", "Has no image" = "#EF5350"))
Making sure if above piechart is correct
sum(ufo_model_data_mutated$has_images != FALSE, na.rm = TRUE)
[1] 0
Number of sightings per shape
ufo_model_data_mutated %>%
count(shape) %>%
ggplot(aes(x = reorder(shape, n), y = n)) +
geom_col(fill = "skyblue") +
coord_flip() +
labs(title = "Number of sightings per shape", x = "Shape", y = "Number of sightings")
Number of sightings per country
ufo_model_data_mutated %>%
count(country_code) %>%
filter(n >= 100) %>%
ggplot(aes(x = reorder(country_code, n), y = n)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(
title = "Number of sightings per country",
x = "Country code",
y = "Number of sightings"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Density map of sightings
world <- ne_countries(scale = "medium", returnclass = "sf")
places_clean %>%
filter(!is.na(latitude), !is.na(longitude)) %>%
mutate(
pop_category = case_when(
population < 10000 ~ "small",
population < 100000 ~ "medium",
TRUE ~ "large"
)
) %>%
ggplot() +
geom_sf(data = world, fill = "lightgray", color = "black") + # Dodajemy mapę
geom_point(aes(
x = longitude, y = latitude,
size = population, color = pop_category
), alpha = 0.6) +
scale_size(range = c(1, 6), guide = "none") +
labs(
title = "Cities with UFO sightings",
subtitle = "Point size ~ population, color ~ population category",
x = "Latitude",
y = "Altitude",
color = "Population category"
) +
theme_minimal()